Loading Data

# Load the spambase train/test splits: 57 numeric features plus the 0/1
# class label in column V58. header = FALSE because the files have no header
# row (spelled out: T/F are reassignable, TRUE/FALSE are reserved).
# NOTE(review): hard-coded absolute paths only work on one machine; consider
# relative paths or a configurable data directory.
spam_train = read.csv("/Users/kevinnguyen/Downloads/spam-train.txt", header = FALSE)
spam_test = read.csv("/Users/kevinnguyen/Downloads/spam-test.txt", header = FALSE)
  1. Standardize the columns so that they all have zero mean and unit variance
# Standardize each of the 57 predictors to zero mean / unit variance and
# carry the class label (column 58) over unchanged.
normalized_train <- data.frame(scale(spam_train[, 1:57, drop = FALSE]), spam_train[58])
normalized_test <- data.frame(scale(spam_test[, 1:57, drop = FALSE]), spam_test[58])
  2. Transform the features using log(xij + 1)
# Compress the heavy right tails of the features with log(x + 1); the label
# column (58) is attached untouched.
log_train <- data.frame(log(spam_train[, 1:57, drop = FALSE] + 1), spam_train[58])
log_test <- data.frame(log(spam_test[, 1:57, drop = FALSE] + 1), spam_test[58])
  3. Discretize each feature using I(xij > 0)
library(arules)
## Loading required package: Matrix
## 
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
# Discretize each feature with the indicator I(x_ij > 0), keeping the label.
# The original route through arules::discretizeDF with a 0.00001 cut point
# only approximates I(x > 0); comparing against 0 directly implements the
# stated transform exactly, needs no extra package, and drops the follow-up
# "- 1" re-coding. (df > 0) yields logicals; + 0L converts them to 0/1.
ds_train = data.frame((spam_train[1:57] > 0) + 0L, spam_train[58])
ds_test = data.frame((spam_test[1:57] > 0) + 0L, spam_test[58])
  4. For each version of the data, visualize it using the tools introduced in the class.
## Normalized Data
# Scatter each standardized feature against the class label, 9 panels per
# figure. Fix: the original condition `i-1 %% 9 == 0` parsed as
# `(i - 1) == 0` because %% binds tighter than `-`, so it was TRUE only at
# i == 1; the intended reset is every 9th feature.
for(i in 1:57) {
  if ((i - 1) %% 9 == 0) {
    par(mfrow=c(3,3))
  }
  plot(normalized_train[,i], normalized_train[,58], main = names(normalized_train)[i])
}

# Boxplot of each standardized feature split by the class label, 9 panels
# per figure. `(i - 1) %% 9` fixes the precedence bug in the original
# condition (`%%` binds tighter than `-`, so the test fired only at i == 1).
for(i in 1:57) {
  if ((i - 1) %% 9 == 0) {
    par(mfrow=c(3,3))
  }
  boxplot(normalized_train[,i] ~ normalized_train[,58], main = names(normalized_train)[i])
}

## Log Data
# Scatter each log-transformed feature against the class label, 9 panels per
# figure. `(i - 1) %% 9` fixes the precedence bug (`%%` binds tighter than
# `-`, so the original test was equivalent to `i == 1`).
for(i in 1:57) {
  if ((i - 1) %% 9 == 0) {
    par(mfrow=c(3,3))
  }
  plot(log_train[,i], log_train[,58], main = names(log_train)[i])
}

# Boxplot of each log-transformed feature split by the class label, 9 panels
# per figure. `(i - 1) %% 9` fixes the precedence bug in the original
# condition.
for(i in 1:57) {
  if ((i - 1) %% 9 == 0) {
    par(mfrow=c(3,3))
  }
  boxplot(log_train[,i] ~ log_train[,58], main = names(log_train)[i])
}

# Discretized Data
# Side-by-side barplots of each 0/1 feature cross-tabulated with the class
# label, 4 panels per figure. Fixes: `(i - 1) %% 4` (the original
# `i-1 %% 4 == 0` evaluated as `(i - 1) == 0` because %% binds tighter than
# `-`), and `beside = TRUE` instead of the reassignable shorthand `T`.
for (i in 1:57){
  counts = table(ds_train[,i], ds_train[,58])
  if ((i - 1) %% 4 == 0) {
    par(mfrow=c(1,4))
  }
  barplot(counts, main = names(ds_train[i]), beside = TRUE, legend = rownames(counts), xlab = names(ds_train[i]), 
          args.legend=list(title="V58"), ylab = "Frequency", col = c("#E7B800", "#00AFBB"))
}

  5. For each version of the data, fit a logistic regression model. Interpret the results, and report the classification errors on both the training and test sets. Do any of the 57 features/predictors appear to be statistically significant? If so, which ones? (Hint: consider this as a multiple testing problem).
## Normalized Data
# Logistic regression of the label V58 on all 57 standardized predictors.
lr_norm_train <- glm(V58 ~ ., data = normalized_train, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# In-sample fitted probabilities, thresholded at 0.5 to get 0/1 predictions.
lr_norm_train_prob <- predict(lr_norm_train, normalized_train, type = "response")
lr_norm_train_predict <- as.numeric(lr_norm_train_prob > 0.5)

# Training confusion matrix.
table(lr_norm_train_predict, spam_train$V58)
##                      
## lr_norm_train_predict    0    1
##                     0 1762  133
##                     1   87 1085
# classification error
(lr_norm_train_error <- mean(lr_norm_train_predict != spam_train$V58))
## [1] 0.07173133
# Same model applied to the held-out test set.
lr_norm_test_prob <- predict(lr_norm_train, normalized_test, type = "response")
lr_norm_test_predict <- as.numeric(lr_norm_test_prob > 0.5)

# Test confusion matrix.
table(lr_norm_test_predict, spam_test$V58)
##                     
## lr_norm_test_predict   0   1
##                    0 877  70
##                    1  39 548
# classification error
(lr_norm_test_error <- mean(lr_norm_test_predict != spam_test$V58))
## [1] 0.07105606
## Log Transformed Data
# Logistic regression of V58 on the log(x + 1)-transformed predictors.
lr_log_train <- glm(V58 ~ ., data = log_train, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Fitted training probabilities, thresholded at 0.5.
lr_log_train_prob <- predict(lr_log_train, log_train, type = "response")
lr_log_train_predict <- as.numeric(lr_log_train_prob > 0.5)

# Training confusion matrix.
table(lr_log_train_predict, spam_train$V58)
##                     
## lr_log_train_predict    0    1
##                    0 1766   94
##                    1   83 1124
# classification error
(lr_log_train_error <- mean(lr_log_train_predict != spam_train$V58))
## [1] 0.05771112
# Held-out test predictions from the same fit.
lr_log_test_prob <- predict(lr_log_train, log_test, type = "response")
lr_log_test_predict <- as.numeric(lr_log_test_prob > 0.5)

# Test confusion matrix.
table(lr_log_test_predict, spam_test$V58)
##                    
## lr_log_test_predict   0   1
##                   0 879  50
##                   1  37 568
# classification error
(lr_log_test_error <- mean(lr_log_test_predict != spam_test$V58))
## [1] 0.05671447
## Discretized Data
# Logistic regression of V58 on the 0/1 indicator features. The warnings
# below report a rank-deficient fit, i.e. some indicator columns are
# linearly dependent in this design.
lr_ds_train <- glm(V58 ~ ., data = ds_train, family = binomial)

lr_ds_train_prob <- predict(lr_ds_train, ds_train, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
lr_ds_train_predict <- as.numeric(lr_ds_train_prob > 0.5)

# Training confusion matrix.
table(lr_ds_train_predict, spam_train$V58)
##                    
## lr_ds_train_predict    0    1
##                   0 1779  105
##                   1   70 1113
# classification error
(lr_ds_train_error <- mean(lr_ds_train_predict != spam_train$V58))
## [1] 0.05705902
# Held-out test predictions from the same fit.
lr_ds_test_prob <- predict(lr_ds_train, ds_test, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
lr_ds_test_predict <- as.numeric(lr_ds_test_prob > 0.5)

# Test confusion matrix.
table(lr_ds_test_predict, spam_test$V58)
##                   
## lr_ds_test_predict   0   1
##                  0 859  67
##                  1  57 551
# classification error
(lr_ds_test_error <- mean(lr_ds_test_predict != spam_test$V58))
## [1] 0.08083442
  6. Apply both linear and quadratic discriminant analysis methods to the standardized data, and the log transformed data. What are the classification errors (training and test)?
library(MASS)
# Keep the raw class-label vectors handy for the error-rate computations
# used throughout the discriminant-analysis and SVM sections below.
train_V58 <- spam_train$V58
test_V58 <- spam_test$V58

Linear Discriminant Analysis: LDA

## Normalized Data
# Fit LDA on the standardized features. The lda() argument is `grouping`;
# spelling it out avoids relying on partial matching of `group =`.
LDA_norm_train = lda(normalized_train[1:57], grouping = train_V58)

# dplyr is loaded here for the %>% pipe used below.
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:arules':
## 
##     intersect, recode, setdiff, setequal, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Test-set predictions first, then the same variable is reused for training.
predict_norm_lda = LDA_norm_train %>% predict(normalized_test[1:57])

# classification error (test)
(LDA_norm_test_error = mean(predict_norm_lda$class!=test_V58))
## [1] 0.1029987
predict_norm_lda = LDA_norm_train %>% predict(normalized_train[1:57])

# classification error (training)
(LDA_norm_train_error = mean(predict_norm_lda$class!=train_V58))
## [1] 0.1017281
## Log Transformed Data
# LDA on the log-transformed features; `grouping =` spelled out (the
# original `group =` worked only through partial argument matching).
LDA_log_train = lda(log_train[1:57], grouping = train_V58)

predict_log_lda = LDA_log_train %>% predict(log_test[1:57])

# classification error (test)
(LDA_log_test_error = mean(predict_log_lda$class!=test_V58))
## [1] 0.06518905
predict_log_lda = LDA_log_train %>% predict(log_train[1:57])

# classification error (training)
(LDA_log_train_error = mean(predict_log_lda$class!=train_V58))
## [1] 0.06031953

Quadratic Discriminant Analysis: QDA

## Normalized Data
# QDA on the standardized features; `grouping =` spelled out (the original
# `group =` relied on partial argument matching).
QDA_norm_train = qda(normalized_train[1:57], grouping = train_V58)

predict_norm_qda = QDA_norm_train %>% predict(normalized_test[1:57])

# classification error (test)
(QDA_norm_test_error = mean(predict_norm_qda$class!=test_V58))
## [1] 0.1747066
predict_norm_qda = QDA_norm_train %>% predict(normalized_train[1:57])

# classification error (training)
(QDA_norm_train_error = mean(predict_norm_qda$class!=train_V58))
## [1] 0.1786762
## Log Transformed Data
# QDA on the log-transformed features; `grouping =` spelled out.
QDA_log_train = qda(log_train[1:57], grouping = train_V58)

predict_log_qda = QDA_log_train %>% predict(log_test[1:57])

# classification error (test)
(QDA_log_test_error = mean(predict_log_qda$class!=test_V58))
## [1] 0.1571056
predict_log_qda = QDA_log_train %>% predict(log_train[1:57])

# classification error (training)
(QDA_log_train_error = mean(predict_log_qda$class!=train_V58))
## [1] 0.1587871
  7. Apply linear and nonlinear support vector machine classifiers to each version of the data. What are the classification errors (training and test)?
library(e1071)

Normalized Data

# svm() treats a factor response as classification; convert the 0/1 labels.
normalized_train$V58 <- factor(normalized_train$V58)
normalized_test$V58 <- factor(normalized_test$V58)
### Linear SVM
# Baseline linear-kernel SVM at cost = 1. scale = FALSE because the data
# were already standardized above.
norm_lsvm = svm(formula = V58~., data = normalized_train, kernel = "linear", cost = 1, scale = FALSE)
summary(norm_lsvm)
## 
## Call:
## svm(formula = V58 ~ ., data = normalized_train, kernel = "linear", 
##     cost = 1, scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
## 
## Number of Support Vectors:  621
## 
##  ( 315 306 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
# 10-fold cross-validation over a grid of cost values to tune the linear SVM.
norm_ltune = tune(svm, V58~., data = normalized_train, kernel = "linear", ranges = list(cost=c(0.01, 0.1, 1, 10, 100)))
summary(norm_ltune)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##   100
## 
## - best performance: 0.073374 
## 
## - Detailed performance results:
##    cost      error dispersion
## 1 1e-02 0.08315450 0.01677995
## 2 1e-01 0.07957144 0.01884720
## 3 1e+00 0.07500479 0.01633094
## 4 1e+01 0.07533265 0.01652315
## 5 1e+02 0.07337400 0.01491234
# Refit on the full training set at the CV-selected cost (100 per the output).
norm_lbestmod = norm_ltune$best.model
summary(norm_lbestmod)
## 
## Call:
## best.tune(method = svm, train.x = V58 ~ ., data = normalized_train, 
##     ranges = list(cost = c(0.01, 0.1, 1, 10, 100)), kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  100 
## 
## Number of Support Vectors:  595
## 
##  ( 310 285 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
norm_lbestmod$cost
## [1] 100
# Training confusion matrix and error for the tuned linear SVM.
norm_lypred_train = predict(norm_lbestmod, normalized_train)
table(predict = norm_lypred_train, truth = normalized_train$V58)
##        truth
## predict    0    1
##       0 1771  121
##       1   78 1097
# classification error
(norm_train_lsvm_error = mean(norm_lypred_train != train_V58))
## [1] 0.06488425
# Held-out test confusion matrix and error.
norm_lypred_test = predict(norm_lbestmod, normalized_test)
table(predict = norm_lypred_test, truth = normalized_test$V58)
##        truth
## predict   0   1
##       0 878  68
##       1  38 550
# classification error
(norm_test_lsvm_error = mean(norm_lypred_test != test_V58))
## [1] 0.06910039
### Non-Linear SVM
# Radial (Gaussian) kernel SVM on the standardized data, baseline cost = 1.
norm_nsvm = svm(formula = V58~., data = normalized_train, kernel = "radial", cost = 1, scale = FALSE)
summary(norm_nsvm)
## 
## Call:
## svm(formula = V58 ~ ., data = normalized_train, kernel = "radial", 
##     cost = 1, scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  926
## 
##  ( 492 434 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
# 10-fold CV jointly over cost and the kernel width gamma.
norm_ntune = tune(svm, V58~., data = normalized_train, kernel = "radial", ranges = list(cost=c(0.01, 0.1, 1, 10, 100, 1000), gamma=c(0.1,0.5,1)))
summary(norm_ntune)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##    10   0.1
## 
## - best performance: 0.06422261 
## 
## - Detailed performance results:
##     cost gamma      error dispersion
## 1  1e-02   0.1 0.39713972 0.02965918
## 2  1e-01   0.1 0.18746673 0.02384839
## 3  1e+00   0.1 0.06846139 0.02081349
## 4  1e+01   0.1 0.06422261 0.02064662
## 5  1e+02   0.1 0.06585233 0.01712869
## 6  1e+03   0.1 0.07074578 0.01336101
## 7  1e-02   0.5 0.39713972 0.02965918
## 8  1e-01   0.5 0.36388516 0.02986606
## 9  1e+00   0.5 0.12324839 0.02473036
## 10 1e+01   0.5 0.11966213 0.02366826
## 11 1e+02   0.5 0.12227119 0.02442247
## 12 1e+03   0.5 0.12325158 0.02715628
## 13 1e-02   1.0 0.39713972 0.02965918
## 14 1e-01   1.0 0.37953099 0.02817976
## 15 1e+00   1.0 0.13597113 0.02877010
## 16 1e+01   1.0 0.13368781 0.02940463
## 17 1e+02   1.0 0.13368781 0.02980286
## 18 1e+03   1.0 0.13466714 0.03154766
# CV-selected model: cost = 10, gamma = 0.1 (per the output above).
norm_nbestmod = norm_ntune$best.model
summary(norm_nbestmod)
## 
## Call:
## best.tune(method = svm, train.x = V58 ~ ., data = normalized_train, 
##     ranges = list(cost = c(0.01, 0.1, 1, 10, 100, 1000), gamma = c(0.1, 
##         0.5, 1)), kernel = "radial")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  10 
## 
## Number of Support Vectors:  1369
## 
##  ( 746 623 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
norm_nbestmod$cost
## [1] 10
norm_nbestmod$gamma
## [1] 0.1
# Training confusion matrix and error for the tuned radial SVM.
norm_nypred_train = predict(norm_nbestmod, normalized_train)
table(predict = norm_nypred_train, truth = normalized_train$V58)
##        truth
## predict    0    1
##       0 1848   15
##       1    1 1203
# classification error
(norm_train_nsvm_error = mean(norm_nypred_train != train_V58))
## [1] 0.005216824
# Held-out test confusion matrix and error.
norm_nypred_test = predict(norm_nbestmod, normalized_test)
table(predict = norm_nypred_test, truth = normalized_test$V58)
##        truth
## predict   0   1
##       0 885  65
##       1  31 553
# classification error
(norm_test_nsvm_error = mean(norm_nypred_test != test_V58))
## [1] 0.06258149

Log Data

# Convert the 0/1 labels to factors so svm() runs in classification mode.
log_train$V58 <- factor(log_train$V58)
log_test$V58 <- factor(log_test$V58)
### Linear SVM
# Baseline linear-kernel SVM on the log-transformed data, cost = 1.
log_lsvm = svm(formula = V58~., data = log_train, kernel = "linear", cost = 1, scale = FALSE)
summary(log_lsvm)
## 
## Call:
## svm(formula = V58 ~ ., data = log_train, kernel = "linear", cost = 1, 
##     scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
## 
## Number of Support Vectors:  529
## 
##  ( 273 256 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
# 10-fold CV over cost to tune the linear SVM.
log_ltune = tune(svm, V58~., data = log_train, kernel = "linear", ranges = list(cost=c(0.001, 0.01, 0.1, 1, 10)))
summary(log_ltune)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##  0.01
## 
## - best performance: 0.05934406 
## 
## - Detailed performance results:
##    cost      error dispersion
## 1 1e-03 0.06912776 0.01523963
## 2 1e-02 0.05934406 0.01433521
## 3 1e-01 0.06064699 0.01734521
## 4 1e+00 0.06097379 0.01840285
## 5 1e+01 0.06162632 0.01824241
# CV-selected model: cost = 0.01 (per the output above).
log_lbestmod = log_ltune$best.model
summary(log_lbestmod)
## 
## Call:
## best.tune(method = svm, train.x = V58 ~ ., data = log_train, ranges = list(cost = c(0.001, 
##     0.01, 0.1, 1, 10)), kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.01 
## 
## Number of Support Vectors:  680
## 
##  ( 343 337 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
log_lbestmod$cost
## [1] 0.01
# Training confusion matrix and error for the tuned linear SVM.
log_lypred_train = predict(log_lbestmod, log_train)
table(predict = log_lypred_train, truth = log_train$V58)
##        truth
## predict    0    1
##       0 1777  109
##       1   72 1109
# classification error
(log_train_lsvm_error = mean(log_lypred_train != train_V58))
## [1] 0.05901532
# Held-out test confusion matrix and error.
log_lypred_test = predict(log_lbestmod, log_test)
table(predict = log_lypred_test, truth = log_test$V58)
##        truth
## predict   0   1
##       0 880  53
##       1  36 565
# classification error
(log_test_lsvm_error = mean(log_lypred_test != test_V58))
## [1] 0.05801825
### Non-Linear SVM
# Radial (Gaussian) kernel SVM on the log-transformed data, baseline cost = 1.
log_nsvm = svm(formula = V58~., data = log_train, kernel = "radial", cost = 1, scale = FALSE)
summary(log_nsvm)
## 
## Call:
## svm(formula = V58 ~ ., data = log_train, kernel = "radial", cost = 1, 
##     scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  877
## 
##  ( 442 435 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
# 10-fold CV jointly over cost and gamma.
log_ntune = tune(svm, V58~., data = log_train, kernel = "radial", ranges = list(cost=c(0.1, 1, 10, 100, 1000), gamma=c(0.001,0.01,0.1)))
summary(log_ntune)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##    10  0.01
## 
## - best performance: 0.04696089 
## 
## - Detailed performance results:
##     cost gamma      error dispersion
## 1  1e-01 0.001 0.10631028 0.01739720
## 2  1e+00 0.001 0.06326244 0.01051033
## 3  1e+01 0.001 0.06065551 0.01611446
## 4  1e+02 0.001 0.05478700 0.01419978
## 5  1e+03 0.001 0.04826489 0.01183385
## 6  1e-01 0.010 0.06456643 0.01379263
## 7  1e+00 0.010 0.05575994 0.01440965
## 8  1e+01 0.010 0.04696089 0.01503510
## 9  1e+02 0.010 0.04696515 0.01625991
## 10 1e+03 0.010 0.04761449 0.01689666
## 11 1e-01 0.100 0.25431969 0.03166235
## 12 1e+00 0.100 0.07141321 0.01620726
## 13 1e+01 0.100 0.06554896 0.01727559
## 14 1e+02 0.100 0.06619829 0.01737204
## 15 1e+03 0.100 0.06749910 0.01428723
# CV-selected model: cost = 10, gamma = 0.01 (per the output above).
log_nbestmod = log_ntune$best.model
summary(log_nbestmod)
## 
## Call:
## best.tune(method = svm, train.x = V58 ~ ., data = log_train, ranges = list(cost = c(0.1, 
##     1, 10, 100, 1000), gamma = c(0.001, 0.01, 0.1)), kernel = "radial")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  10 
## 
## Number of Support Vectors:  609
## 
##  ( 323 286 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
log_nbestmod$cost
## [1] 10
log_nbestmod$gamma
## [1] 0.01
# Training confusion matrix and error for the tuned radial SVM.
log_nypred_train = predict(log_nbestmod, log_train)
table(predict = log_nypred_train, truth = log_train$V58)
##        truth
## predict    0    1
##       0 1826   45
##       1   23 1173
# classification error
(log_train_nsvm_error = mean(log_nypred_train != train_V58))
## [1] 0.0221715
# Held-out test confusion matrix and error (lowest test error in this report).
log_nypred_test = predict(log_nbestmod, log_test)
table(predict = log_nypred_test, truth = log_test$V58)
##        truth
## predict   0   1
##       0 892  34
##       1  24 584
# classification error
(log_test_nsvm_error = mean(log_nypred_test != test_V58))
## [1] 0.03780965

Discretized Data

# Convert the 0/1 labels to factors so svm() runs in classification mode.
ds_train$V58 <- factor(ds_train$V58)
ds_test$V58 <- factor(ds_test$V58)
### Linear SVM
# Baseline linear-kernel SVM on the 0/1 indicator features, cost = 1.
ds_lsvm = svm(formula = V58~., data = ds_train, kernel = "linear", cost = 1, scale = FALSE)
summary(ds_lsvm)
## 
## Call:
## svm(formula = V58 ~ ., data = ds_train, kernel = "linear", cost = 1, 
##     scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
## 
## Number of Support Vectors:  560
## 
##  ( 280 280 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
# 10-fold CV over cost to tune the linear SVM.
ds_ltune = tune(svm, V58~., data = ds_train, kernel = "linear", ranges = list(cost=c(0.01, 0.1, 1, 10, 100)))
summary(ds_ltune)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##     1
## 
## - best performance: 0.06652509 
## 
## - Detailed performance results:
##    cost      error dispersion
## 1 1e-02 0.07695174 0.01134329
## 2 1e-01 0.06945349 0.01435534
## 3 1e+00 0.06652509 0.01247640
## 4 1e+01 0.06913628 0.01534461
## 5 1e+02 0.06946094 0.01407189
# CV-selected model: cost = 1 (per the output above).
ds_lbestmod = ds_ltune$best.model
summary(ds_lbestmod)
## 
## Call:
## best.tune(method = svm, train.x = V58 ~ ., data = ds_train, ranges = list(cost = c(0.01, 
##     0.1, 1, 10, 100)), kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
## 
## Number of Support Vectors:  560
## 
##  ( 280 280 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
ds_lbestmod$cost
## [1] 1
# Training confusion matrix and error for the tuned linear SVM.
ds_lypred_train = predict(ds_lbestmod, ds_train)
table(predict = ds_lypred_train, truth = ds_train$V58)
##        truth
## predict    0    1
##       0 1780  116
##       1   69 1102
# classification error
(ds_train_lsvm_error = mean(ds_lypred_train != train_V58))
## [1] 0.06031953
# Held-out test confusion matrix and error.
ds_lypred_test = predict(ds_lbestmod, ds_test)
table(predict = ds_lypred_test, truth = ds_test$V58)
##        truth
## predict   0   1
##       0 865  63
##       1  51 555
# classification error
(ds_test_lsvm_error = mean(ds_lypred_test != test_V58))
## [1] 0.07431551
### Non-Linear SVM
# Radial (Gaussian) kernel SVM on the indicator features, baseline cost = 1.
ds_nsvm = svm(formula = V58~., data = ds_train, kernel = "radial", cost = 1, scale = FALSE)
summary(ds_nsvm)
## 
## Call:
## svm(formula = V58 ~ ., data = ds_train, kernel = "radial", cost = 1, 
##     scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  814
## 
##  ( 411 403 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
# 10-fold CV jointly over cost and gamma.
ds_ntune = tune(svm, V58~., data = ds_train, kernel = "radial", ranges = list(cost=c(0.1, 1, 10, 100, 1000), gamma=c(0.1,0.5,1,1.5)))
summary(ds_ntune)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##   100   0.5
## 
## - best performance: 0.04466266 
## 
## - Detailed performance results:
##     cost gamma      error  dispersion
## 1  1e-01   0.1 0.07107258 0.012037349
## 2  1e+00   0.1 0.05835728 0.009856153
## 3  1e+01   0.1 0.04694599 0.011796695
## 4  1e+02   0.1 0.05215665 0.014451322
## 5  1e+03   0.1 0.05248025 0.015197744
## 6  1e-01   0.5 0.12227012 0.027584807
## 7  1e+00   0.5 0.04499053 0.011037153
## 8  1e+01   0.5 0.04498733 0.012897975
## 9  1e+02   0.5 0.04466266 0.011680089
## 10 1e+03   0.5 0.04466266 0.011680089
## 11 1e-01   1.0 0.36585233 0.032682283
## 12 1e+00   1.0 0.10759618 0.021438712
## 13 1e+01   1.0 0.10010006 0.019918645
## 14 1e+02   1.0 0.10010006 0.019918645
## 15 1e+03   1.0 0.10010006 0.019918645
## 16 1e-01   1.5 0.37530604 0.029262410
## 17 1e+00   1.5 0.12324200 0.022104985
## 18 1e+01   1.5 0.11867855 0.020872856
## 19 1e+02   1.5 0.11867855 0.020872856
## 20 1e+03   1.5 0.11867855 0.020872856
# CV-selected model: cost = 100, gamma = 0.5 (per the output above).
ds_nbestmod = ds_ntune$best.model
summary(ds_nbestmod)
## 
## Call:
## best.tune(method = svm, train.x = V58 ~ ., data = ds_train, ranges = list(cost = c(0.1, 
##     1, 10, 100, 1000), gamma = c(0.1, 0.5, 1, 1.5)), kernel = "radial")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  100 
## 
## Number of Support Vectors:  1475
## 
##  ( 756 719 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
ds_nbestmod$cost
## [1] 100
ds_nbestmod$gamma
## [1] 0.5
# Training confusion matrix and error for the tuned radial SVM.
ds_nypred_train = predict(ds_nbestmod, ds_train)
table(predict = ds_nypred_train, truth = ds_train$V58)
##        truth
## predict    0    1
##       0 1842   14
##       1    7 1204
# classification error
(ds_train_nsvm_error = mean(ds_nypred_train != train_V58))
## [1] 0.006847082
# Held-out test confusion matrix and error.
ds_nypred_test = predict(ds_nbestmod, ds_test)
table(predict = ds_nypred_test, truth = ds_test$V58)
##        truth
## predict   0   1
##       0 889  35
##       1  27 583
# classification error
(ds_test_nsvm_error = mean(ds_nypred_test != test_V58))
## [1] 0.04041721

Report classification errors using different methods and different preprocessed data in a table, and comment on the different performances.

# Collect every train/test error into one 5 x 6 summary table
# (methods x data versions). The matrix is initialized to NA, so the cells
# for methods not run on a data version (LDA/QDA on the discretized data)
# need no explicit assignment — the original's `= NA` fills were redundant.
error_matrix = matrix(data = NA, nrow = 5, ncol = 6)
colnames(error_matrix) = c("Normal train", "Normal test", "Log Train", "Log Test", 
                          "Discretized Train", "Discretized Test")
rownames(error_matrix) = c("Logistic Regression", "LDA", "QDA", "SVM Linear", "SVM Gaussian")
error_matrix
##                     Normal train Normal test Log Train Log Test
## Logistic Regression           NA          NA        NA       NA
## LDA                           NA          NA        NA       NA
## QDA                           NA          NA        NA       NA
## SVM Linear                    NA          NA        NA       NA
## SVM Gaussian                  NA          NA        NA       NA
##                     Discretized Train Discretized Test
## Logistic Regression                NA               NA
## LDA                                NA               NA
## QDA                                NA               NA
## SVM Linear                         NA               NA
## SVM Gaussian                       NA               NA
# Fill one row per method instead of 30 scalar assignments.
error_matrix[1, ] = c(lr_norm_train_error, lr_norm_test_error,
                      lr_log_train_error, lr_log_test_error,
                      lr_ds_train_error, lr_ds_test_error)
error_matrix[2, 1:4] = c(LDA_norm_train_error, LDA_norm_test_error,
                         LDA_log_train_error, LDA_log_test_error)
error_matrix[3, 1:4] = c(QDA_norm_train_error, QDA_norm_test_error,
                         QDA_log_train_error, QDA_log_test_error)
error_matrix[4, ] = c(norm_train_lsvm_error, norm_test_lsvm_error,
                      log_train_lsvm_error, log_test_lsvm_error,
                      ds_train_lsvm_error, ds_test_lsvm_error)
error_matrix[5, ] = c(norm_train_nsvm_error, norm_test_nsvm_error,
                      log_train_nsvm_error, log_test_nsvm_error,
                      ds_train_nsvm_error, ds_test_nsvm_error)
error_matrix
##                     Normal train Normal test  Log Train   Log Test
## Logistic Regression  0.071731334  0.07105606 0.05771112 0.05671447
## LDA                  0.101728073  0.10299870 0.06031953 0.06518905
## QDA                  0.178676231  0.17470665 0.15878709 0.15710561
## SVM Linear           0.064884252  0.06910039 0.05901532 0.05801825
## SVM Gaussian         0.005216824  0.06258149 0.02217150 0.03780965
##                     Discretized Train Discretized Test
## Logistic Regression       0.057059015       0.08083442
## LDA                                NA               NA
## QDA                                NA               NA
## SVM Linear                0.060319530       0.07431551
## SVM Gaussian              0.006847082       0.04041721

The lowest testing classification error rate among the models and different preprocessed data was with the gaussian non-linear support vector machine model based on the log(xij + 1) transformation. It was reported at around 3.8%. Most of the other testing classification errors hovered between 4% and 8%, with QDA a clear outlier at roughly 16-17%.

The lowest training classification error rate among the models and different preprocessed data was with the gaussian non-linear support vector machine model based on the normalized data. It was reported at <1%. All of the other training classification errors ranged from <1% to 17.8%.

Finally, use either a single method with properly chosen tuning parameter or a combination of several methods to design a classifier with test error rate as small as possible. Describe your recommended method, and report its performance.

# refer to Log transformed data Non-Linear SVM
# Recommended classifier: the radial-kernel SVM tuned on the log-transformed
# data (the lowest test error in the summary table above).
best_preprocessed_data_train <- log_train
best_preprocessed_data_test <- log_test
best_method_model <- log_nbestmod
summary(best_method_model)
## 
## Call:
## best.tune(method = svm, train.x = V58 ~ ., data = log_train, ranges = list(cost = c(0.1, 
##     1, 10, 100, 1000), gamma = c(0.001, 0.01, 0.1)), kernel = "radial")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  10 
## 
## Number of Support Vectors:  609
## 
##  ( 323 286 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
# classification error
log_test_nsvm_error
## [1] 0.03780965

Earlier in the code, we tuned the nonlinear SVM model based on all of the preprocessed data and found that the log transformed data with a cost = 10 and a gamma = 0.01 was the model that provided the smallest test error rate. We found that the testing error was reported at around 3.8%. The nonlinear SVM model was based on the Gaussian (radial) kernel, which allows curved, non-linear decision boundaries between the classes. Its advantage over the linear SVM here indicated that the data was not linearly separable.